Project: Wrangling and Analyze Data¶

Table of Contents¶

  • Data Gathering
  • Assessing Data
  • Cleaning Data
  • Storing Data
  • Analyzing and Visualizing Data
In [1]:
import pandas as pd
import requests
import os
import json
import numpy as np

Data Gathering¶

WeRateDogs Twitter archive data

In [2]:
archive = pd.read_csv('twitter-archive-enhanced.csv')

Tweet image predictions

In [3]:
# Download the image-predictions file from the Udacity server.
# NOTE: the original URL string had a leading space; requests tolerates
# leading whitespace, but the cleaned form is unambiguous and safer.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response
Out[3]:
<Response [200]>
In [4]:
# Create a local folder for the downloaded predictions file.
# exist_ok makes this a no-op if the folder is already there,
# replacing the separate os.path.exists check.
folder_name = 'Image_predictions'
os.makedirs(folder_name, exist_ok=True)
In [5]:
# Write the downloaded bytes to disk inside the predictions folder,
# keeping the filename taken from the last segment of the URL.
target_path = os.path.join(folder_name, url.split('/')[-1])
with open(target_path, mode='wb') as out_file:
    out_file.write(response.content)
In [6]:
Image_predictions = pd.read_csv('image-predictions.tsv', sep = '\t')

Additional data from the twitter API

In [7]:
#import tweepy
#from tweepy import OAuthHandler
#from timeit import default_timer as timer

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
#consumer_key = 'HIDDEN'
#consumer_secret = 'HIDDEN'
#access_token = 'HIDDEN'
#access_secret = 'HIDDEN'

#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)

#api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
#archive = pd.read_csv('twitter-archive-enhanced.csv')
#tweet_ids = archive.tweet_id.values
#len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    #for tweet_id in tweet_ids:
        #count += 1
        #print(str(count) + ": " + str(tweet_id))
        #try:
            #tweet = api.get_status(tweet_id, tweet_mode='extended')
            #print("Success")
            #json.dump(tweet._json, outfile)
            #outfile.write('\n')
        #except AttributeError:
            #print("Fail")
            #fails_dict[tweet_id] = e
            #pass
#end = timer()
#print(end - start)
#print(fails_dict)
In [8]:
# Build the API-supplement DataFrame from the line-delimited JSON file.
# Each line of tweet-json.txt is one tweet's full JSON payload; we keep
# only the fields the analysis actually uses. (The original also collected
# created_at and retweeted, then silently discarded them via the `columns`
# argument of the DataFrame constructor — dead work removed here.)

df_list = []

with open('tweet-json.txt', 'r') as file:
    for line in file:
        data = json.loads(line)
        # Append one record per tweet
        df_list.append({'tweet_id': data['id'],
                        'favorite_count': data['favorite_count'],
                        'retweet_count': data['retweet_count']})

tweet_json = pd.DataFrame(df_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])
        

Assessing Data¶

In [9]:
archive
Out[9]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None None None None
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... 6 10 a None None None None
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... 9 10 a None None None None
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... 7 10 a None None None None
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None None None None

2356 rows × 17 columns

In [10]:
Image_predictions
Out[10]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
... ... ... ... ... ... ... ... ... ... ... ... ...
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 orange 0.097049 False bagel 0.085851 False banana 0.076110 False

2075 rows × 12 columns

In [11]:
tweet_json
Out[11]:
tweet_id favorite_count retweet_count
0 892420643555336193 39467 8853
1 892177421306343426 33819 6514
2 891815181378084864 25461 4328
3 891689557279858688 42908 8964
4 891327558926688256 41048 9774
... ... ... ...
2349 666049248165822465 111 41
2350 666044226329800704 311 147
2351 666033412701032449 128 47
2352 666029285002620928 132 48
2353 666020888022790149 2535 532

2354 rows × 3 columns

In [12]:
archive.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 non-null   object 
 14  floofer                     2356 non-null   object 
 15  pupper                      2356 non-null   object 
 16  puppo                       2356 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
In [13]:
Image_predictions.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
In [14]:
tweet_json.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   tweet_id        2354 non-null   int64
 1   favorite_count  2354 non-null   int64
 2   retweet_count   2354 non-null   int64
dtypes: int64(3)
memory usage: 55.3 KB
In [15]:
archive.tweet_id.duplicated().sum()
Out[15]:
0
In [16]:
archive['in_reply_to_status_id'].notnull().sum()
Out[16]:
78
In [17]:
archive['retweeted_status_id'].notnull().sum()
Out[17]:
181
In [18]:
archive.name.value_counts().head(60)
Out[18]:
None        745
a            55
Charlie      12
Cooper       11
Lucy         11
Oliver       11
Tucker       10
Penny        10
Lola         10
Winston       9
Bo            9
Sadie         8
the           8
Daisy         7
Buddy         7
Toby          7
an            7
Bailey        7
Leo           6
Oscar         6
Stanley       6
Rusty         6
Bella         6
Dave          6
Jack          6
Koda          6
Scout         6
Jax           6
Milo          6
Louis         5
Phil          5
Sammy         5
Gus           5
Chester       5
Alfie         5
Sunny         5
Bentley       5
very          5
Oakley        5
Finn          5
George        5
Larry         5
Clarence      4
Loki          4
Walter        4
Reggie        4
Hank          4
Scooter       4
Duke          4
quite         4
Reginald      4
Maggie        4
Luna          4
Maddie        4
Dexter        4
Jerry         4
Carl          4
Ruby          4
Riley         4
Clark         4
Name: name, dtype: int64
In [19]:
archive.rating_numerator.describe()
Out[19]:
count    2356.000000
mean       13.126486
std        45.876648
min         0.000000
25%        10.000000
50%        11.000000
75%        12.000000
max      1776.000000
Name: rating_numerator, dtype: float64
In [20]:
archive.rating_numerator.unique()
Out[20]:
array([  13,   12,   14,    5,   17,   11,   10,  420,  666,    6,   15,
        182,  960,    0,   75,    7,   84,    9,   24,    8,    1,   27,
          3,    4,  165, 1776,  204,   50,   99,   80,   45,   60,   44,
        143,  121,   20,   26,    2,  144,   88], dtype=int64)
In [21]:
archive.rating_denominator.describe()
Out[21]:
count    2356.000000
mean       10.455433
std         6.745237
min         0.000000
25%        10.000000
50%        10.000000
75%        10.000000
max       170.000000
Name: rating_denominator, dtype: float64
In [22]:
archive.rating_denominator.unique()
Out[22]:
array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2], dtype=int64)
In [23]:
Image_predictions.describe()
Out[23]:
tweet_id img_num p1_conf p2_conf p3_conf
count 2.075000e+03 2075.000000 2075.000000 2.075000e+03 2.075000e+03
mean 7.384514e+17 1.203855 0.594548 1.345886e-01 6.032417e-02
std 6.785203e+16 0.561875 0.271174 1.006657e-01 5.090593e-02
min 6.660209e+17 1.000000 0.044333 1.011300e-08 1.740170e-10
25% 6.764835e+17 1.000000 0.364412 5.388625e-02 1.622240e-02
50% 7.119988e+17 1.000000 0.588230 1.181810e-01 4.944380e-02
75% 7.932034e+17 1.000000 0.843855 1.955655e-01 9.180755e-02
max 8.924206e+17 4.000000 1.000000 4.880140e-01 2.734190e-01
In [24]:
tweet_json.describe()
Out[24]:
tweet_id favorite_count retweet_count
count 2.354000e+03 2354.000000 2354.000000
mean 7.426978e+17 8080.968564 3164.797366
std 6.852812e+16 11814.771334 5284.770364
min 6.660209e+17 0.000000 0.000000
25% 6.783975e+17 1415.000000 624.500000
50% 7.194596e+17 3603.500000 1473.500000
75% 7.993058e+17 10122.250000 3652.000000
max 8.924206e+17 132810.000000 79515.000000

Tidiness¶

  • dog stages should be in one column
  • all dataframes should be in one table

Quality¶

  • tweet_id is a string, not an int, in the 3 datasets

Archive table:
  • No retweets or replies are needed for this analysis
  • timestamp should be a datetime, not an object; rename the column
  • Dog names are missing, misspelled, or invalid
  • Rating denominators less than 10 and incorrect rating numerators

Image_predictions table:
  • Column names are not descriptive
  • Dog breeds are sometimes lowercase, sometimes uppercase, and contain underscores
  • Dog breed should be a category, not an object

Cleaning Data¶

In [25]:
# Make copies of original pieces of data
archive_clean = archive.copy()
Image_predictions_clean = Image_predictions.copy()
tweet_json_clean = tweet_json.copy()

Quality issue:¶

Issue #1:¶

  • Inaccurate data: no retweets or replies are needed for this analysis

Define:¶

  • Use the drop function to delete retweets and replies, since we need only original tweets

Code¶

In [26]:
# Keep only original tweets: drop every row that is a reply or a retweet.
is_reply = archive_clean['in_reply_to_status_id'].notnull()
is_retweet = archive_clean['retweeted_status_id'].notnull()
archive_clean.drop(archive_clean[is_reply | is_retweet].index, inplace=True)

Test¶

In [27]:
archive_clean['in_reply_to_status_id'].notnull().sum()
Out[27]:
0
In [28]:
archive_clean['retweeted_status_id'].notnull().sum()
Out[28]:
0
  • Drop columns unnecessary for the analysis: in_reply_to_status_id, in_reply_to_user_id, source, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp
In [29]:
archive_clean.drop(['in_reply_to_status_id','in_reply_to_user_id','source', 'retweeted_status_id', 'retweeted_status_user_id','retweeted_status_timestamp' ], axis=1, inplace=True)
In [30]:
archive_clean.shape
Out[30]:
(2097, 11)

Issue #2:¶

  • tweet_id data type

Define:¶

  • Convert tweet_id to string in the three data sets
Code¶
In [31]:
# tweet_id is an identifier, not a quantity — store it as a string in all
# three tables so later merges behave consistently.
for frame in (archive_clean, Image_predictions_clean, tweet_json_clean):
    frame['tweet_id'] = frame['tweet_id'].astype(str)
Test¶
In [32]:
archive_clean.tweet_id.dtypes, Image_predictions_clean.tweet_id.dtypes , tweet_json_clean.tweet_id.dtypes
Out[32]:
(dtype('O'), dtype('O'), dtype('O'))

Issue #3:¶

  • timestamp data type

Define:¶

  • Convert timestamp to datetime and rename the column to 'tweet_date'

Code:¶

In [33]:
# Parse the raw timestamp text into timezone-aware datetimes and give the
# column a more descriptive name.
archive_clean.rename(columns={'timestamp': 'tweet_date'}, inplace=True)
archive_clean['tweet_date'] = pd.to_datetime(archive_clean['tweet_date'])

Test:¶

In [34]:
archive_clean.tweet_date.dtypes
Out[34]:
datetime64[ns, UTC]

Tidiness issue¶

Issue1¶

  • Dog stages data in one column
Define:¶
  • melt columns doggo, floofer, pupper and puppo
  • clean the new data column
Code:¶
In [35]:
archive_clean['dog_stages'] = archive_clean['doggo'] + archive_clean['floofer'] + archive_clean['pupper'] + archive_clean['puppo']
In [36]:
# Strip the literal "None" placeholders so only real stage names remain;
# rows with no stage become the empty string. Uses the vectorized pandas
# .str.replace (regex disabled for a literal match) instead of
# apply() with a lambda.
archive_clean['dog_stages'] = archive_clean['dog_stages'].str.replace('None', '', regex=False)
Test:¶
In [37]:
archive_clean.dog_stages.value_counts()
Out[37]:
                1761
pupper           221
doggo             72
puppo             23
floofer            9
doggopupper        9
doggopuppo         1
doggofloofer       1
Name: dog_stages, dtype: int64
In [38]:
# Drop 'doggo', 'floofer', 'pupper', 'puppo' columns
archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo' ], axis=1, inplace=True)
Clean dog stages column¶
In [39]:
archive_clean[archive_clean['dog_stages']=='doggopupper']
Out[39]:
tweet_id tweet_date text expanded_urls rating_numerator rating_denominator name dog_stages
460 817777686764523521 2017-01-07 16:59:28+00:00 This is Dido. She's playing the lead role in "... https://twitter.com/dog_rates/status/817777686... 13 10 Dido doggopupper
531 808106460588765185 2016-12-12 00:29:28+00:00 Here we have Burke (pupper) and Dexter (doggo)... https://twitter.com/dog_rates/status/808106460... 12 10 None doggopupper
575 801115127852503040 2016-11-22 17:28:25+00:00 This is Bones. He's being haunted by another d... https://twitter.com/dog_rates/status/801115127... 12 10 Bones doggopupper
705 785639753186217984 2016-10-11 00:34:48+00:00 This is Pinot. He's a sophisticated doggo. You... https://twitter.com/dog_rates/status/785639753... 10 10 Pinot doggopupper
733 781308096455073793 2016-09-29 01:42:20+00:00 Pupper butt 1, Doggo 0. Both 12/10 https://t.c... https://vine.co/v/5rgu2Law2ut 12 10 None doggopupper
889 759793422261743616 2016-07-31 16:50:42+00:00 Meet Maggie &amp; Lila. Maggie is the doggo, L... https://twitter.com/dog_rates/status/759793422... 12 10 Maggie doggopupper
956 751583847268179968 2016-07-09 01:08:47+00:00 Please stop sending it pictures that don't eve... https://twitter.com/dog_rates/status/751583847... 5 10 None doggopupper
1063 741067306818797568 2016-06-10 00:39:48+00:00 This is just downright precious af. 12/10 for ... https://twitter.com/dog_rates/status/741067306... 12 10 just doggopupper
1113 733109485275860992 2016-05-19 01:38:16+00:00 Like father (doggo), like son (pupper). Both 1... https://twitter.com/dog_rates/status/733109485... 12 10 None doggopupper
In [40]:
# After visual assessment Pinot is not a dog 
archive_clean.drop([705], inplace=True)
In [41]:
archive_clean[archive_clean.name=='Pinot']
Out[41]:
tweet_id tweet_date text expanded_urls rating_numerator rating_denominator name dog_stages
In [42]:
# After visual assessment Dido is a Pupper, will be changed manually
archive_clean.at[460,'dog_stages']='pupper'
In [43]:
archive_clean.loc[460, : ]
Out[43]:
tweet_id                                             817777686764523521
tweet_date                                    2017-01-07 16:59:28+00:00
text                  This is Dido. She's playing the lead role in "...
expanded_urls         https://twitter.com/dog_rates/status/817777686...
rating_numerator                                                     13
rating_denominator                                                   10
name                                                               Dido
dog_stages                                                       pupper
Name: 460, dtype: object
In [44]:
# After visual assessment Bones is a Pupper, will be changed manually
archive_clean.at[575,'dog_stages']='pupper'
In [45]:
archive_clean.loc[575, : ]
Out[45]:
tweet_id                                             801115127852503040
tweet_date                                    2016-11-22 17:28:25+00:00
text                  This is Bones. He's being haunted by another d...
expanded_urls         https://twitter.com/dog_rates/status/801115127...
rating_numerator                                                     12
rating_denominator                                                   10
name                                                              Bones
dog_stages                                                       pupper
Name: 575, dtype: object
In [46]:
archive_clean[archive_clean['dog_stages']=='doggopuppo']
Out[46]:
tweet_id tweet_date text expanded_urls rating_numerator rating_denominator name dog_stages
191 855851453814013952 2017-04-22 18:31:02+00:00 Here's a puppo participating in the #ScienceMa... https://twitter.com/dog_rates/status/855851453... 13 10 None doggopuppo
In [47]:
# After visual assessment this dog is a puppo
archive_clean.at[191,'dog_stages']='puppo'
In [48]:
archive_clean.loc[191, : ]
Out[48]:
tweet_id                                             855851453814013952
tweet_date                                    2017-04-22 18:31:02+00:00
text                  Here's a puppo participating in the #ScienceMa...
expanded_urls         https://twitter.com/dog_rates/status/855851453...
rating_numerator                                                     13
rating_denominator                                                   10
name                                                               None
dog_stages                                                        puppo
Name: 191, dtype: object
In [49]:
archive_clean[archive_clean['dog_stages']=='doggofloofer']
Out[49]:
tweet_id tweet_date text expanded_urls rating_numerator rating_denominator name dog_stages
200 854010172552949760 2017-04-17 16:34:26+00:00 At first I thought this was a shy doggo, but i... https://twitter.com/dog_rates/status/854010172... 11 10 None doggofloofer
In [50]:
# After visual assessment
archive_clean.at[200,'dog_stages']='floofer'
In [51]:
archive_clean.loc[200, : ]
Out[51]:
tweet_id                                             854010172552949760
tweet_date                                    2017-04-17 16:34:26+00:00
text                  At first I thought this was a shy doggo, but i...
expanded_urls         https://twitter.com/dog_rates/status/854010172...
rating_numerator                                                     11
rating_denominator                                                   10
name                                                               None
dog_stages                                                      floofer
Name: 200, dtype: object
In [52]:
# After visual assessment, all remaining doggopupper rows really contain
# two dogs (a doggo and a pupper), so we separate the stages with a comma.
archive_clean['dog_stages'] = np.where(archive_clean['dog_stages']=='doggopupper', 'doggo,pupper', archive_clean['dog_stages'])
In [53]:
archive_clean.dog_stages.value_counts()
Out[53]:
                1761
pupper           223
doggo             72
puppo             24
floofer           10
doggo,pupper       6
Name: dog_stages, dtype: int64

Quality issues¶

Issue #4:¶

  • Dog names are missing, misspelled, or invalid
Define:¶
  • Extract correct names from the text using regular expressions for 'This is ...' and '... named ...'
  • extract invalid names
Code:¶
In [54]:
# Names beginning with a lowercase letter ("a", "the", "very", ...) are
# sentence fragments picked up by the archive's extractor, not real dog
# names — replace them with NaN.
archive_clean.name.replace(regex=r'(^[a-z]+)',value=np.nan, inplace=True )
In [55]:
# Extract candidate dog names that follow the phrase "This is " in the
# tweet text (first following word, letters only).
archive_clean['name_after_Thisis'] = archive_clean['text'].str.extract('This\sis\s([a-zA-Z]+)', expand=True)

# As above, lowercase matches are ordinary words, not names — null them out.
archive_clean.name_after_Thisis.replace(regex=r'(^[a-z]+)',value=np.nan, inplace=True )
In [56]:
# Extract dogs's names after 'named ...'
archive_clean['name_after_named'] = archive_clean['text'].str.extract('named ([A-Za-z]+)\.', expand=True)
In [57]:
# Build the final Names column from the two text extractions; rows with no
# extracted name get the empty string. NOTE(review): this assumes at most
# one of the two extraction columns is non-null per row — if both matched,
# the names would be concatenated. Confirm on the data.
archive_clean['Names'] =archive_clean['name_after_Thisis'].fillna('') + archive_clean['name_after_named'].fillna('')

# Drop the original unreliable name column and the two helper columns
archive_clean.drop(['name','name_after_Thisis','name_after_named'  ], axis=1, inplace=True)
Test:¶
In [58]:
# Verify if there are any invalid names left
invalid_names = archive_clean['Names'].str.extract('(^[a-z]+)', expand=True)
invalid_names.value_counts()
Out[58]:
Series([], dtype: int64)
In [59]:
archive_clean['Names'].value_counts().head(60)
Out[59]:
            997
Lucy         10
Cooper        9
Tucker        8
Charlie       8
Penny         8
Oliver        8
Lola          6
Bella         6
Oscar         6
Bo            6
Louis         5
Scout         5
Buddy         5
Bailey        5
Koda          5
Finn          4
Daisy         4
Derek         4
Chip          4
Jerry         4
Toby          4
Leo           4
Jeffrey       4
Stanley       4
Bentley       4
Sadie         4
Clark         4
Alfie         4
Cassie        4
Dexter        4
Archie        4
Gus           4
Wallace       3
Duke          3
Winnie        3
Milo          3
Mia           3
Walter        3
Ruby          3
Brody         3
Maximus       3
Dave          3
Oakley        3
Riley         3
Carl          3
Kyle          3
Sophie        3
Wyatt         3
Rosie         3
Reginald      3
Phil          3
Bruce         3
Jimothy       3
Bear          3
Waffles       3
Klevin        3
Rusty         3
Winston       3
Chester       3
Name: Names, dtype: int64
In [60]:
list(archive_clean)
Out[60]:
['tweet_id',
 'tweet_date',
 'text',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'dog_stages',
 'Names']

Issue #5:¶

Define¶

  • extract the rating numerator from the text
  • extract the rating denominator to adjust the rating numerator
  • drop rating denominator since its all equal to 10

Code:¶

In [61]:
# Re-extract the numerator straight from the tweet text; the pattern
# captures an integer or decimal immediately preceding "/denominator"
# (the archive's integer column truncated decimals such as 13.5 and 9.75).
archive_clean['rating_numerator_from_text'] = archive_clean['text'].str.extract('(\d+\.*\d*)\/\d+', expand=True)

archive_clean['rating_numerator_from_text'].value_counts()
Out[61]:
12       486
10       435
11       413
13       287
9        153
8         98
7         52
14        38
5         33
6         32
3         19
4         16
2          9
1          5
11.27      1
9.75       1
88         1
144        1
11.26      1
121        1
44         1
60         1
45         1
80         1
99         1
50         1
204        1
1776       1
165        1
13.5       1
0          1
84         1
24         1
420        1
Name: rating_numerator_from_text, dtype: int64
In [62]:
archive_clean['rating_denominator_from_text'] = archive_clean['text'].str.extract('\d+\.*\d*/(\d+)', expand=True)
archive_clean['rating_denominator_from_text'].value_counts()
Out[62]:
10     2079
50        3
11        2
80        2
70        1
7         1
150       1
170       1
20        1
90        1
40        1
110       1
120       1
2         1
Name: rating_denominator_from_text, dtype: int64
In [63]:
archive_clean['rating_denominator_from_text']=archive_clean['rating_denominator_from_text'].astype('float')
archive_clean['rating_numerator_from_text']=archive_clean['rating_numerator_from_text'].astype('float')
In [64]:
# Drop non-dog rates after visual assessment — one drop call with the full
# index list instead of nine separate calls.
archive_clean.drop([2091, 2261, 2338, 315, 1761, 2079, 2237, 2349, 2074], inplace=True)
In [65]:
# Correct rating numerators after visual assessment of the tweet text
# (the regex picked up the wrong fraction for these two tweets).
archive_clean.at[2335,'rating_numerator_from_text']=9
archive_clean.at[1202,'rating_numerator_from_text']=11
In [66]:
archive_clean.loc[1202, : ]
Out[66]:
tweet_id                                                       716439118184652801
tweet_date                                              2016-04-03 01:36:11+00:00
text                            This is Bluebert. He just saw that both #Final...
expanded_urls                   https://twitter.com/dog_rates/status/716439118...
rating_numerator                                                               50
rating_denominator                                                             50
dog_stages                                                                       
Names                                                                    Bluebert
rating_numerator_from_text                                                   11.0
rating_denominator_from_text                                                 50.0
Name: 1202, dtype: object
In [67]:
# Logan is a gryffindor dog, that's why he has a 'Nine and three quarters' rating, but actually he is a 13, will be changed manually
archive_clean.at[695,'rating_numerator_from_text']=13
In [68]:
archive_clean.loc[695, : ]
Out[68]:
tweet_id                                                       786709082849828864
tweet_date                                              2016-10-13 23:23:56+00:00
text                            This is Logan, the Chow who lived. He solemnly...
expanded_urls                   https://twitter.com/dog_rates/status/786709082...
rating_numerator                                                               75
rating_denominator                                                             10
dog_stages                                                                       
Names                                                                       Logan
rating_numerator_from_text                                                   13.0
rating_denominator_from_text                                                 10.0
Name: 695, dtype: object
In [69]:
# These values are not ratings (presumably a "24/7" figure of speech in
# the text — TODO confirm against the tweet), so null out the stray
# 24 numerator and 7 denominator instead of treating them as scores.
archive_clean.rating_numerator_from_text.replace (24.00,value=np.nan, inplace=True )
archive_clean.rating_denominator_from_text.replace (7,value=np.nan, inplace=True )
In [70]:
# Normalize group ratings onto a /10 scale so all tweets are comparable
# (e.g. 44/40 becomes 11.0).
archive_clean['Rating_numerator'] = (archive_clean['rating_numerator_from_text']/archive_clean['rating_denominator_from_text'])*10
In [71]:
# Drop unnecessary columns 
archive_clean.drop(['rating_numerator_from_text','rating_denominator_from_text','rating_numerator' ,'rating_denominator' ], axis=1, inplace=True)

Test:¶

In [72]:
archive_clean['Rating_numerator'].value_counts()
Out[72]:
12.000000      490
10.000000      436
11.000000      418
13.000000      288
9.000000       153
8.000000        98
7.000000        51
14.000000       38
5.000000        33
6.000000        32
3.000000        19
4.000000        15
2.000000         6
11.270000        1
13.500000        1
1776.000000      1
8.181818         1
2.200000         1
6.363636         1
11.260000        1
1.000000         1
45.000000        1
Name: Rating_numerator, dtype: int64
In [73]:
list(archive_clean)
Out[73]:
['tweet_id',
 'tweet_date',
 'text',
 'expanded_urls',
 'dog_stages',
 'Names',
 'Rating_numerator']

Issue #6:¶

Define:¶

  • Rename the columns for more clarity

Code:¶

In [74]:
# Give the cryptic p1/p2/p3 prediction columns self-describing names.
# NOTE(review): the *_conf columns hold the classifier's confidence score, so
# 'confidence{i}' might be a clearer target than 'confirmation{i}' — kept as-is
# because later cells reference these exact names.
column_names = {}
for i in ('1', '2', '3'):
    column_names['p' + i] = 'Image_prediction' + i
    column_names['p' + i + '_conf'] = 'confirmation' + i
    column_names['p' + i + '_dog'] = 'Dog_prediction' + i
Image_predictions_clean = Image_predictions_clean.rename(columns=column_names)

Test:¶

In [75]:
list(Image_predictions_clean)
Out[75]:
['tweet_id',
 'jpg_url',
 'img_num',
 'Image_prediction1',
 'confirmation1',
 'Dog_prediction1',
 'Image_prediction2',
 'confirmation2',
 'Dog_prediction2',
 'Image_prediction3',
 'confirmation3',
 'Dog_prediction3']

Issue #7:¶

  • Dog breeds are inconsistently cased (sometimes lowercase, sometimes capitalized) and contain underscores

Define:¶

  • Capitalize the dog prediction data and replace the underscore '_' with a space

Code:¶

In [76]:
# Normalize breed names: Title Case, and underscores replaced with spaces.
for col in ('Image_prediction1', 'Image_prediction2', 'Image_prediction3'):
    Image_predictions_clean[col] = (
        Image_predictions_clean[col].str.title().str.replace('_', ' ')
    )

Test:¶

In [77]:
Image_predictions_clean
Out[77]:
tweet_id jpg_url img_num Image_prediction1 confirmation1 Dog_prediction1 Image_prediction2 confirmation2 Dog_prediction2 Image_prediction3 confirmation3 Dog_prediction3
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh Springer Spaniel 0.465074 True Collie 0.156665 True Shetland Sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 Redbone 0.506826 True Miniature Pinscher 0.074192 True Rhodesian Ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German Shepherd 0.596461 True Malinois 0.138584 True Bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian Ridgeback 0.408143 True Redbone 0.360687 True Miniature Pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 Miniature Pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
... ... ... ... ... ... ... ... ... ... ... ... ...
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 Basset 0.555712 True English Springer 0.225770 True German Short-Haired Pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 Paper Towel 0.170278 False Labrador Retriever 0.168086 True Spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True Malamute 0.078253 True Kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True Papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 Orange 0.097049 False Bagel 0.085851 False Banana 0.076110 False

2075 rows × 12 columns

Issue #8:¶

Define:¶

  • Convert the image predictions to category

Code:¶

In [78]:
# Breed predictions are low-cardinality strings, so 'category' is the right
# dtype (saves memory and documents the column's nature).
for col in ('Image_prediction1', 'Image_prediction2', 'Image_prediction3'):
    Image_predictions_clean[col] = Image_predictions_clean[col].astype('category')

Test:¶

In [79]:
# Verify the three prediction columns are now dtype 'category'.
Image_predictions_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   tweet_id           2075 non-null   object  
 1   jpg_url            2075 non-null   object  
 2   img_num            2075 non-null   int64   
 3   Image_prediction1  2075 non-null   category
 4   confirmation1      2075 non-null   float64 
 5   Dog_prediction1    2075 non-null   bool    
 6   Image_prediction2  2075 non-null   category
 7   confirmation2      2075 non-null   float64 
 8   Dog_prediction2    2075 non-null   bool    
 9   Image_prediction3  2075 non-null   category
 10  confirmation3      2075 non-null   float64 
 11  Dog_prediction3    2075 non-null   bool    
dtypes: bool(3), category(3), float64(3), int64(1), object(2)
memory usage: 165.4+ KB

Tidiness issue¶

Issue2¶

  • All datasets should form one dataset
Define:¶
  • Merge all datasets
Code:¶
In [80]:
# Attach the image predictions to the archive; a left join keeps tweets that
# have no image prediction row.
twitter_archive = archive_clean.merge(Image_predictions_clean, on='tweet_id', how='left')
In [81]:
# Add the API-sourced favorite/retweet counts to complete the master dataset.
twitter_archive_master = twitter_archive.merge(tweet_json_clean, on='tweet_id', how='left')
Test:¶
In [82]:
# Inspect the merged frame: 20 columns; image-related columns are null for
# tweets without a prediction row (1961 of 2087 rows have one).
twitter_archive_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2087 entries, 0 to 2086
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   tweet_id           2087 non-null   object             
 1   tweet_date         2087 non-null   datetime64[ns, UTC]
 2   text               2087 non-null   object             
 3   expanded_urls      2084 non-null   object             
 4   dog_stages         2087 non-null   object             
 5   Names              2087 non-null   object             
 6   Rating_numerator   2086 non-null   float64            
 7   jpg_url            1961 non-null   object             
 8   img_num            1961 non-null   float64            
 9   Image_prediction1  1961 non-null   category           
 10  confirmation1      1961 non-null   float64            
 11  Dog_prediction1    1961 non-null   object             
 12  Image_prediction2  1961 non-null   category           
 13  confirmation2      1961 non-null   float64            
 14  Dog_prediction2    1961 non-null   object             
 15  Image_prediction3  1961 non-null   category           
 16  confirmation3      1961 non-null   float64            
 17  Dog_prediction3    1961 non-null   object             
 18  favorite_count     2087 non-null   int64              
 19  retweet_count      2087 non-null   int64              
dtypes: category(3), datetime64[ns, UTC](1), float64(5), int64(2), object(9)
memory usage: 355.4+ KB

Storing Data¶

Storing data in a CSV file¶

In [83]:
# Persist the cleaned master dataset; index=False keeps the CSV free of the
# meaningless positional index.
twitter_archive_master.to_csv('twitter_archive_master.csv',index=False)

Storing data in a SQLite database¶

In [84]:
# SQLAlchemy supplies the engine pandas uses to write into SQLite.
# NOTE(review): imports are conventionally gathered in the notebook's first
# code cell so a fresh-kernel run surfaces missing dependencies immediately.
from sqlalchemy import create_engine
In [85]:
# Create SQLAlchemy Engine and empty twitter archive database
# (the .db file itself is created lazily on the first write).
engine = create_engine('sqlite:///twitter_archive_master.db')
In [86]:
# Store the cleaned master DataFrame in a table called twitter_archive_master
# in twitter_archive_master.db.
# if_exists='replace' makes this cell idempotent: pandas' default is
# if_exists='fail', which raises ValueError on any re-run because the table
# already exists after the first execution.
twitter_archive_master.to_sql('twitter_archive_master', engine, index=False, if_exists='replace')
Out[86]:
2087

Analyzing and Visualizing Data¶

In [87]:
# Plotting libraries for the analysis section.
# NOTE(review): imports are conventionally gathered in the first code cell so
# a fresh-kernel run surfaces missing dependencies immediately.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [88]:
# Reload the stored master dataset so the analysis section is self-contained.
# NOTE(review): round-tripping through CSV drops the dtypes set during cleaning
# (tweet_date comes back as object, category columns as object) — consider
# parse_dates=['tweet_date'] or reusing the in-memory frame if dtypes matter.
twitter_archive_master = pd.read_csv('twitter_archive_master.csv')
twitter_archive_master
Out[88]:
tweet_id tweet_date text expanded_urls dog_stages Names Rating_numerator jpg_url img_num Image_prediction1 confirmation1 Dog_prediction1 Image_prediction2 confirmation2 Dog_prediction2 Image_prediction3 confirmation3 Dog_prediction3 favorite_count retweet_count
0 892420643555336193 2017-08-01 16:23:56+00:00 This is Phineas. He's a mystical boy. Only eve... https://twitter.com/dog_rates/status/892420643... NaN Phineas 13.0 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1.0 Orange 0.097049 False Bagel 0.085851 False Banana 0.076110 False 39467 8853
1 892177421306343426 2017-08-01 00:17:27+00:00 This is Tilly. She's just checking pup on you.... https://twitter.com/dog_rates/status/892177421... NaN Tilly 13.0 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1.0 Chihuahua 0.323581 True Pekinese 0.090647 True Papillon 0.068957 True 33819 6514
2 891815181378084864 2017-07-31 00:18:03+00:00 This is Archie. He is a rare Norwegian Pouncin... https://twitter.com/dog_rates/status/891815181... NaN Archie 12.0 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1.0 Chihuahua 0.716012 True Malamute 0.078253 True Kelpie 0.031379 True 25461 4328
3 891689557279858688 2017-07-30 15:58:51+00:00 This is Darla. She commenced a snooze mid meal... https://twitter.com/dog_rates/status/891689557... NaN Darla 13.0 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1.0 Paper Towel 0.170278 False Labrador Retriever 0.168086 True Spatula 0.040836 False 42908 8964
4 891327558926688256 2017-07-29 16:00:24+00:00 This is Franklin. He would like you to stop ca... https://twitter.com/dog_rates/status/891327558... NaN Franklin 12.0 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2.0 Basset 0.555712 True English Springer 0.225770 True German Short-Haired Pointer 0.175219 True 41048 9774
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2082 666049248165822465 2015-11-16 00:24:50+00:00 Here we have a 1949 1st generation vulpix. Enj... https://twitter.com/dog_rates/status/666049248... NaN NaN 5.0 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1.0 Miniature Pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True 111 41
2083 666044226329800704 2015-11-16 00:04:52+00:00 This is a purebred Piers Morgan. Loves to Netf... https://twitter.com/dog_rates/status/666044226... NaN NaN 6.0 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1.0 Rhodesian Ridgeback 0.408143 True Redbone 0.360687 True Miniature Pinscher 0.222752 True 311 147
2084 666033412701032449 2015-11-15 23:21:54+00:00 Here is a very happy pup. Big fan of well-main... https://twitter.com/dog_rates/status/666033412... NaN NaN 9.0 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1.0 German Shepherd 0.596461 True Malinois 0.138584 True Bloodhound 0.116197 True 128 47
2085 666029285002620928 2015-11-15 23:05:30+00:00 This is a western brown Mitsubishi terrier. Up... https://twitter.com/dog_rates/status/666029285... NaN NaN 7.0 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1.0 Redbone 0.506826 True Miniature Pinscher 0.074192 True Rhodesian Ridgeback 0.072010 True 132 48
2086 666020888022790149 2015-11-15 22:32:08+00:00 Here we have a Japanese Irish Setter. Lost eye... https://twitter.com/dog_rates/status/666020888... NaN NaN 8.0 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1.0 Welsh Springer Spaniel 0.465074 True Collie 0.156665 True Shetland Sheepdog 0.061428 True 2535 532

2087 rows × 20 columns

Questions¶

  • Is there a correlation between the favorite count and the retweet count?
  • What are the top favored and rated dog stages ?
  • What are the top rated and retweeted dogs?

Correlation¶

In [89]:
# Pearson correlation between likes and retweets. corr() is symmetric, so the
# argument order does not affect the result.
correlation = twitter_archive_master['retweet_count'].corr(twitter_archive_master['favorite_count'])
correlation
Out[89]:
0.9116032698106865

The correlation between favorite_count and retweet_count is around 0.9116. Being close to 1, this indicates the two quantities are strongly positively correlated.

In [90]:
# Scatter plot with a fitted regression line to visualize the strong positive
# relationship between likes and retweets.
sns.color_palette()
fig1 = sns.regplot(x=twitter_archive_master["favorite_count"], y=twitter_archive_master["retweet_count"],
                   scatter_kws={"color": 'darkgreen'}, line_kws={"color": "red"})
# Fixed title: this is a regression scatter plot, not a correlation matrix.
fig1.set(xlabel='favorite_count', ylabel='retweet_count',
         title="Correlation of favorite count and retweet count");

Favorite dog stages per rating¶

In [91]:
# Top 10 (dog stage, rating) pairs by total favorites.
# Selecting 'favorite_count' BEFORE summing avoids aggregating the non-numeric
# columns (deprecated, and a TypeError in modern pandas) and only computes the
# sum actually needed. reset_index() on the resulting Series already yields a
# DataFrame, so the pd.DataFrame(...) wrapper is unnecessary.
Top_rating_favorite_stages = (
    twitter_archive_master
    .groupby(['dog_stages', 'Rating_numerator'])['favorite_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
Top_rating_favorite_stages
Out[91]:
dog_stages Rating_numerator favorite_count
0 doggo 13.0 536789
1 pupper 12.0 449907
2 pupper 13.0 376697
3 puppo 13.0 351106
4 pupper 11.0 292288
5 doggo 14.0 256874
6 doggo 12.0 237380
7 doggo 11.0 188781
8 pupper 14.0 185428
9 pupper 10.0 172021
In [93]:
# Stacked bars of total favorites per dog stage, colored by rating.
fig2 = px.bar(
    Top_rating_favorite_stages,
    x='dog_stages',
    y='favorite_count',
    color='Rating_numerator',
    title="Top favorite dog stages per rating",
    text_auto='.3s',
)
fig2.update_layout(xaxis={'categoryorder': 'total descending'},
                   yaxis={'visible': False, 'showticklabels': False},
                   xaxis_title=None)
fig2.update_traces(textfont_size=15, textangle=0, cliponaxis=False)

Top retweeted dogs names per rating¶

In [94]:
# Top 10 named dogs by total retweets, broken down by rating and stage.
# Selecting 'retweet_count' BEFORE summing avoids aggregating the non-numeric
# columns (deprecated, and a TypeError in modern pandas) and only computes the
# sum actually needed.
Top_rating_retweeted_Names = (
    twitter_archive_master
    .groupby(['Names', 'Rating_numerator', 'dog_stages'])['retweet_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
Top_rating_retweeted_Names
Out[94]:
Names Rating_numerator dog_stages retweet_count
0 Bo 14.0 doggo 42228
1 Jamesy 13.0 pupper 32883
2 Sunny 14.0 doggo 21794
3 Gabe 14.0 pupper 17209
4 Barney 13.0 doggo 13076
5 Cupid 13.0 doggo 11878
6 Sebastian 13.0 puppo 11007
7 Astrid 13.0 doggo 10706
8 Reginald 12.0 puppo 9374
9 Cassie 14.0 doggo 7711
In [95]:
# Bars of total retweets per dog name, colored by rating.
fig3 = px.bar(
    Top_rating_retweeted_Names,
    x='Names',
    y='retweet_count',
    color='Rating_numerator',
    title="Top retweeted Names per rating",
    text_auto='.3s',
)
fig3.update_layout(yaxis={'visible': False, 'showticklabels': False},
                   xaxis_title=None)
fig3.update_traces(textfont_size=15, textangle=0,
                   textposition="outside", cliponaxis=False)
In [96]:
# Inspect the single tweet behind the top retweet entry (Bo, rated 14/10).
is_top_bo = (twitter_archive_master['Names'] == 'Bo') & (twitter_archive_master['Rating_numerator'] == 14)
Bo = twitter_archive_master[is_top_bo]
Bo
Out[96]:
tweet_id tweet_date text expanded_urls dog_stages Names Rating_numerator jpg_url img_num Image_prediction1 confirmation1 Dog_prediction1 Image_prediction2 confirmation2 Dog_prediction2 Image_prediction3 confirmation3 Dog_prediction3 favorite_count retweet_count
349 819004803107983360 2017-01-11 02:15:36+00:00 This is Bo. He was a very good First Doggo. 14... https://twitter.com/dog_rates/status/819004803... doggo Bo 14.0 https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg 1.0 Standard Poodle 0.351308 True Toy Poodle 0.271929 True Tibetan Terrier 0.094759 True 95450 42228
In [97]:
# Render Bo's photo inline from the tweet's media URL.
# NOTE(review): import is conventionally placed in the notebook's first code cell.
from IPython.display import Image
Image(url= "https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg", width=500, height=300)
Out[97]:
In [ ]: